package wordsegementation;

import org.apache.commons.lang3.StringUtils;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.StringTokenizer;

public class TernarySearch {

    public static final class TSTNode {
        public String data;
        protected TSTNode leftNode;
        protected TSTNode midNode;
        protected TSTNode rightNode;

        protected char spliter;

        public TSTNode(char key) {
            this.spliter = key;
        }
    }

    public static TSTNode rootNode;

    private FSTNumberOrEn fstNumberOrEn = new FSTNumberOrEn();

    static {
        try {
            FileInputStream file = new FileInputStream("./SDIC.txt");
            BufferedReader in = new BufferedReader(new InputStreamReader(file, "GBK"));
            String line;
            while ((line = in.readLine()) != null) {
                StringTokenizer st = new StringTokenizer(line, "\t");
                String key = st.nextToken();

                TSTNode currentNode = addWord(key);
                currentNode.data = key;
            }
            in.close();
            file.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * 构建三叉Tire树的方法
     * <p>
     * 返回该字符串最后一个字符指向的节点；
     * 这样做的好处是，最后一个字符的标志节点，可以存放这个字符串。
     * TSTNode.data如果不为空表示这节点可以表示一个字符串，如果为空该节点肯定还有匹配完毕，还可以继续匹配，符合最长字典字符数组匹配规则。
     */
    public static TSTNode addWord(String key) {
        if (StringUtils.isBlank(key)) {
            return null;
        }
        int charIndex = 0;
        if (rootNode == null) {
            rootNode = new TSTNode(key.charAt(charIndex));
        }

        TSTNode currentNode = rootNode;

        while (true) {
            int charComp = key.charAt(charIndex) - currentNode.spliter;

            if (charComp == 0) {
                charIndex++;
                if (charIndex == key.length()) {
                    return currentNode;
                }
                if (currentNode.midNode == null) {
                    currentNode.midNode = new TSTNode(key.charAt(charIndex));
                }
                currentNode = currentNode.midNode;
            } else if (charComp < 0) {
                if (currentNode.leftNode == null) {
                    currentNode.leftNode = new TSTNode(key.charAt(charIndex));
                }
                currentNode = currentNode.leftNode;
            } else {
                if (currentNode.rightNode == null) {
                    currentNode.rightNode = new TSTNode(key.charAt(charIndex));
                }
                currentNode = currentNode.rightNode;
            }
        }
    }

    private String text;
    private int offset;


    public TernarySearch(String text) {
        //替换中文语句中的一些标点符号
        this.text = text.replaceAll("[\\t\\r\\n\\、\\，\\。\\：\\；\\“\\‘\\”\\【\\】\\『\\』\\|\\=\\+\\-\\—\\—\\（\\）\\*\\…\\…\\！\\~\\·\\《\\》\\？\\/\\?\\<\\>\\,\\.\\;\\:\\'\\\\\\\"\\[\\]\\{\\}\\_\\)\\(\\^\\!\\`]", " ");
        this.offset = 0;
    }

    /**
     * 根据构造方法传入的中文语句，对其进行分词
     * 采用算法：最长中文词典词组匹配算法
     */
    public String next() {
        if (StringUtils.isBlank(text) || offset >= text.length() || rootNode == null) {
            return null;
        }

        int charIndex = offset;
        TSTNode currentNode = rootNode;
        String ret = null;

        //处理未登陆串
        int index = fstNumberOrEn.matchNumberOrEn(text, offset);
        if (index != 0 && index != offset && offset < text.length()) {
            ret = text.substring(offset, index);
            offset = index;
            return ret;
        }

        while (true) {
            if (currentNode == null) {
                if (StringUtils.isBlank(ret)) {
                    ret = text.substring(offset, offset + 1);
                    offset++;
                }
                return ret;
            }

            //判断字符与在当前节点的左子树，中子树还是右子树
            int charComp = text.charAt(charIndex) - currentNode.spliter;

            if (charComp == 0) {
                charIndex++;

                if (StringUtils.isNotBlank(currentNode.data)) {
                    ret = currentNode.data;
                    offset = charIndex;
                }
                if (text.length() == charIndex) {
                    return ret;
                }
                currentNode = currentNode.midNode;
            } else if (charComp < 0) {
                currentNode = currentNode.leftNode;
            } else {
                currentNode = currentNode.rightNode;
            }
        }
    }

    public static void main(String[] args) {
        TernarySearch ternarySearch = new TernarySearch("文件指出，2020年是全面建成小康社会目标实现之年，是全面打赢脱贫攻坚战收官之年。党中央认为，完成上述两大目标任务，脱贫攻坚最后堡垒必须攻克，全面小康“三农”领域突出短板必须补上。");

        String word = null;
        do {
            word = ternarySearch.next();
            if(StringUtils.isBlank(word)) {
                continue;
            }
            System.out.println(word);
        } while (word != null);
    }

}
